*** Session setup: start from a clean Stata state before doing any work
*** Close a log file if one is still open (capture suppresses the error when none is)
capture log close
*** Clear data and other objects held in memory
clear all
*** Drop any programs that are still defined
program drop _all
*** Erase every stored snapshot
snapshot erase _all
*** Do not pause the output with --more-- prompts
set more off
*** Point the PLUS system directory at the ado folder on the M: drive
sysdir set PLUS "M:\Ado\Plus"

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** File name: 		Prepare P's and L's.do
*** Last updated: 	2/2/2016
***
*** This file reads in the raw p's and l's data from PovcalNet and LIS, then
***		1. Estimates the parameters of the GQ and Beta models of the Lorenz curve for surveys for which we have <100 p's and l's
***		2. Determines the best model to use for each survey based on 1) validity and 2) goodness of fit
***		3. Uses the best model (GQ or Beta) to calculate 100 fitted p's and l's for surveys where we have <100 p's and l's
***		4. Combines the urban and rural income distributions into a national income distribution for countries with urban and rural p's and l's
*** 	5. Saves the prepared p's and l's data:
***		   Output Data/Cleaned P's and L's.dta
********************************************************************************************
********************************************************************************************
********************************************************************************************

*** Work from the project root held in the global macro "directory".
*** NOTE(review): the global is not defined anywhere in this file - presumably a master do-file sets it; confirm.
cd "${directory}"

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 1. Estimate the parameters of the GQ and Beta models of the Lorenz curve for surveys for which we have <100 p's and l's
***		a. Read in the data
***		b. Generate inputs to the Lorenz curve models
***		c. Use regressions to estimate the Lorenz curve parameters
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Read in the data
*****************************************************************
*****************************************************************

**************************************
*** Read in the PovcalNet & LIS p's and l's
**************************************
use "Input Data/Raw P's and L's.dta", clear

**************************************
*** Restrict the data to surveys with fewer than 100 complete (p, l) points
**************************************
*** Discard points where either coordinate is (system) missing
drop if (l == .) | (p == .)
*** A survey is one iso3c-year-urbrur-surveytype cell; keep the cell only if it holds under 100 points
bysort iso3c year urbrur surveytype: keep if _N < 100

**************************************
*** Give every remaining survey its own numeric identifier
**************************************
egen survey_id = group(iso3c year urbrur surveytype)

*****************************************************************
*****************************************************************
*** b. Generate inputs to the Lorenz curve models
*****************************************************************
*****************************************************************

**************************************
*** Create parameters for the GQ model of the Lorenz curve
**************************************
*** The GQ model is estimated further down with the no-constant regression
***     l*(1-l) = a*(p^2 - l) + b*l*(p - 1) + c*(p - l)
*** The variables a, b and c created here are the REGRESSORS; the coefficient estimated on
*** each one is the GQ parameter of the same name.
gen depgq = l*(1-l)
gen a = (p^2)-l
gen b = l*(p-1)
gen c = p-l
label variable depgq "Dependent Variable for GQ Model"
foreach parm in "a" "b" "c" {
	*** Label previously read "GW Model" (typo); the model is GQ
	label variable `parm' "`parm' Parameter for GQ Model"
}

**************************************
*** Create parameters for the Beta model of the Lorenz curve
*** Note that the model given as l = p - theta * p^gamma * (1-p)^delta
*** can be transformed into ln(-l + p) = ln(theta) + gamma*ln(p) + delta*ln(1-p)
*** The variables gamma and delta created here are the regressors ln(p) and ln(1-p);
*** ln(theta) is the regression constant.
*** ln() of a non-positive number is missing, so points with p <= l, p = 0 or p = 1
*** drop out of the Beta regression.
**************************************
gen depbeta = ln(p-l)
gen gamma = ln(p)
gen delta = ln(1-p)
label variable depbeta "Dependent Variable for Beta Model"
foreach parm in "gamma" "delta" {
	label variable `parm' "`parm' Parameter for Beta Model"
}

*****************************************************************
*****************************************************************
*** c. Use regressions to estimate the Lorenz curve parameters
*****************************************************************
*****************************************************************

**************************************
*** Estimate GQ Parameters
**************************************

*** Set up a survey-level results file for the GQ model: one row per survey, with the
*** parameters a, b and c still missing.
*** The tempfile name is used as-is (no ".dta" appended): Stata only erases files saved under
*** the exact temporary name, so a suffixed copy would be left behind in the temp folder.
preserve
keep survey_id countryname iso3c year surveytype urbrur survey_mean na_mean
duplicates drop
foreach parm in a b c {
	gen `parm' = .
	label variable `parm' "`parm' Parameter for GQ Model"
}
tempfile gq
save `gq', replace
restore

*** Run the GQ regression for each survey and store its coefficients in the results file
quietly levelsof survey_id, local(surveys)
foreach survey of local surveys {

	preserve

	*** Keep only data from this survey
	keep if survey_id == `survey'

	*** Run regression (no constant); the coefficients stay available in _b[] after the -use- below
	reg depgq a b c, noconstant

	*** Write the coefficients into this survey's row of the results file.
	*** The same tempfile is reused on every pass rather than declaring a new one each time.
	use `gq', clear
	foreach parm in a b c {
		replace `parm' = _b[`parm'] if survey_id == `survey'
	}
	save `gq', replace

	restore

}

**************************************
*** Estimate Beta Parameters
**************************************

*** Set up a survey-level results file for the Beta model (same layout as for GQ)
preserve
keep survey_id countryname iso3c year surveytype urbrur survey_mean na_mean
duplicates drop
foreach parm in gamma delta theta {
	gen `parm' = .
	label variable `parm' "`parm' Parameter for Beta Model"
}
tempfile beta
save `beta', replace
restore

*** Run the Beta regression for each survey and store its coefficients in the results file
quietly levelsof survey_id, local(surveys)
foreach survey of local surveys {

	preserve

	*** Keep only data from this survey
	keep if survey_id == `survey'

	*** Run regression; p = 1 is excluded explicitly (ln(1-p) is missing there in any case)
	reg depbeta gamma delta if p != 1

	*** Write the coefficients into this survey's row of the results file
	use `beta', clear
	foreach parm in gamma delta {
		replace `parm' = _b[`parm'] if survey_id == `survey'
	}
	*** The constant of the log-linear regression is ln(theta)
	replace theta = exp(_b[_cons]) if survey_id == `survey'
	save `beta', replace

	restore

}

**************************************
*** Combine results from both models (one row per survey; every survey must appear in both files)
**************************************
use `gq', clear
merge 1:1 survey_id iso3c countryname year surveytype survey_mean na_mean using `beta', assert(3) nogen norep

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 2. Determine the best model to use for each survey based on 1) validity and 2) goodness of fit
***		a. Conduct validity checks on the results of the GQ and Beta model estimates 
***		   Note: See Datt 1998 for details on the reason for each validity check
***		b. Calculate the sum of squared errors (SSE) for each model
***		c. Mark the best model based on 1) which model is valid and 2) which model has the lowest SSE 
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Conduct validity checks on the results of the GQ and Beta model estimates 
*****************************************************************
*****************************************************************

**************************************
*** GQ validity checks
**************************************

*** Quantities derived from the estimated GQ coefficients; the checks below are written in terms of them
gen e = -(a + b + c + 1)
gen m = (b^2) - (4*a)
gen n = 2*b*e - 4*c
gen r = (n^2 - 4*m*(e^2))^(1/2)
label variable e "e Parameter for GQ model (= -(a + b + c + 1))"
label variable m "m Parameter for GQ model (= (b^2) - (4*a))"
label variable n "n Parameter for GQ model (= 2*b*e - 4*c)"
label variable r "r Parameter for GQ model (= (n^2 - 4*m*e^2)^(1/2))"

*** Check 1 - L(0; pi) = 0, which holds when e < 0
gen gq_v1 = e < 0
label variable gq_v1 "GQ Validity Check 1"

*** Check 2 - L(1; pi) = 1, which holds when a + c >= 1
*** The threshold is relaxed to 0.98. Datt 1998 notes that small violations of this condition
*** (for example L(1; pi) = 0.99) need not be worrying for poverty measurement, because the
*** latter depends on tracking the Lorenz curve only up to the head-count index.
gen gq_v2 = (a + c >= 0.98)
label variable gq_v2 "GQ Validity Check 2"

*** Check 3 - L'(0+, pi) >= 0, which holds when c >= 0
gen gq_v3 = c >= 0
label variable gq_v3 "GQ Validity Check 3"

*** Check 4 - L''(p, pi) >= 0 for p in (0, 1), which holds when
***     m < 0, OR
***     0 < m < (n^2)/(4e^2) together with either n >= 0 or m < -n/2
gen gq_v4 = ((m < 0) | ///
			((m > 0) & (m < (n^2)/(4*e^2)) & ((n >= 0) | (m < -n/2))))
label variable gq_v4 "GQ Validity Check 4"

*** The GQ model counts as valid only when all four checks pass
gen gq_valid = (gq_v1 == 1 & gq_v2 == 1 & gq_v3 == 1 & gq_v4 == 1)
label variable gq_valid "GQ Model is Valid"

*** Eyeball the flag: tabulate every combination of the checks against gq_valid
*** (the gq_v* wildcard also picks up gq_valid)
preserve
contract gq_v*
sort gq_valid
list, abbreviate(20) sepby(gq_valid)
restore

**************************************
*** Beta validity checks
**************************************

*** Condition 1: 	L(0; pi) = 0
*** Satisfied automatically by the functional form
gen beta_v1 = 1
label variable beta_v1 "Beta Validity Check 1"

*** Condition 2:	L(1; pi) = 1
*** Satisfied automatically by the functional form
gen beta_v2 = 1
label variable beta_v2 "Beta Validity Check 2"

*** Condition 3:	L'(0+, pi) >= 0
*** Satisfied if: 	L'(0.001; theta, gamma, delta) >= 0
*** where L'(p) = 1 - theta * p^gamma * (1-p)^delta * (gamma/p - delta/(1-p))
local p = 0.001
gen beta_v3 = ((1 - theta * (`p' ^ gamma) * ((1 - `p') ^ delta) * (gamma / `p' - delta / (1 - `p'))) >= 0)
label variable beta_v3 "Beta Validity Check 3"

*** Condition 4:	L''(p, pi) >= 0 for p in (0, 1)
*** Satisfied if:	L''(p; theta, gamma, delta) >=  0 for p in (0.01.... 0.99)
*** where L''(p) = theta * p^gamma * (1-p)^delta *
***                [ gamma*(1-gamma)/p^2 + 2*gamma*delta/(p*(1-p)) + delta*(1-delta)/(1-p)^2 ]
*** (differentiate L(p) = p - theta * p^gamma * (1-p)^delta twice).
*** The last term is delta*(1-delta); it was previously coded as delta*(1-gamma).
*** The grid is driven by an integer counter so each p is exactly i/100 and the final point 0.99
*** is always evaluated; a fractional forvalues step accumulates floating-point error.
gen beta_v4 = 1
forvalues i = 1/99 {
	local p = `i' / 100
	quietly replace beta_v4 = 0 if theta * (`p' ^ gamma) * ((1 - `p') ^ delta) * ///
		(((gamma * (1 - gamma)) / (`p' ^ 2)) + ///
		 ((2 * gamma * delta) / (`p' * (1 - `p'))) + ///
		 ((delta * (1 - delta)) / ((1 - `p') ^ 2))) < 0
}
label variable beta_v4 "Beta Validity Check 4"

*** Mark cases for which the Beta model is valid (all four checks pass)
gen beta_valid = (beta_v1 + beta_v2 + beta_v3 + beta_v4 == 4)
label variable beta_valid "Beta Model is Valid"
*** Check if this was marked correctly (the beta_v* wildcard also picks up beta_valid)
preserve
contract beta_v*
sort beta_valid
list, ab(20) sepby(beta_valid)
restore

**************************************
*** Mark the preferred model based on the validity checks
*** Note: Following WB, we will use the GQ model if neither model is valid
**************************************

*** Classify each survey by which model(s) passed the validity checks:
***     1 = both valid, 2 = only Beta valid, 3 = only GQ valid or neither valid (GQ is the fallback)
label define preference 1 "Both Models Valid" 2 "Beta Model Valid" 3 "GQ Model Valid OR Neither Model Valid"
gen preference = 1 if beta_valid == 1 & gq_valid == 1
replace preference = 2 if beta_valid == 1 & gq_valid == 0
replace preference = 3 if (beta_valid == 0 & gq_valid == 1) | (beta_valid == 0 & gq_valid == 0)
*** Every survey must have fallen into exactly one of the three cases
assert preference < .
label values preference preference
*** The closing quote was previously typed as ")", leaving the string unterminated
label variable preference "Model Preference Based on Validity"

*** Check if the preference was marked correctly
preserve
contract preference *_valid
list, ab(20) sepby(preference)
restore

*** Keep only necessary variables
*** NOTE(review): countryname-theta is a positional variable range, so what is kept depends on the
*** current variable order in memory - confirm it spans the identifiers and all model parameters.
order countryname-theta e m n r gq_valid beta_valid preference
keep countryname-theta e m n r gq_valid beta_valid preference

*****************************************************************
*****************************************************************
*** b. Calculate the sum of squared errors (SSE) for each model
*****************************************************************
*****************************************************************

**************************************
*** Merge in raw p's and l's
**************************************
*** Attach each survey's parameters to its raw points. Raw surveys without estimated parameters
*** (using-only rows) are allowed but discarded; a parameter row without raw points is an error.
merge 1:m year iso3c surveytype urbrur using "Input Data/Raw P's and L's.dta", assert(using match) keep(match) nogenerate noreport

**************************************
*** Sum of squared errors of each model, per survey
**************************************

*** Fitted l under the GQ model
gen l_pred_gq = -0.5*(b*p + e + (m*p^2 + n*p + e^2)^0.5)

*** Fitted l under the Beta model
gen l_pred_beta = p - (theta*(p^gamma)*((1-p)^delta))

*** Squared residual at every point, then its total within the survey
*** (with the -missing- option the total is missing when every squared error in the survey is missing)
foreach fit in gq beta {
	gen sq_error_`fit' = (l - l_pred_`fit')^2
	bysort year iso3c surveytype urbrur: egen sse_`fit' = total(sq_error_`fit'), missing
	label variable sse_`fit' "Sum of Squared Errors for `fit' Model"
}

*** Only the survey-level totals are needed from here on
drop sq_error_gq sq_error_beta l_pred_gq l_pred_beta

*****************************************************************
*****************************************************************
*** c. Mark the best model based on 1) which model is valid and 2) which model has the lowest SSE 
*****************************************************************
*****************************************************************
*** Reminder of the preference codes used below
label list preference

label define best_model 1 "GQ" 2 "Beta"

*** Validity decides first: preference 3 (only GQ valid, or neither) -> GQ; preference 2 (only Beta valid) -> Beta
gen best_model = 1 if preference == 3
replace best_model = 2 if preference == 2
*** When both models are valid the lower SSE wins. An exact tie goes to GQ, matching the GQ fallback
*** used when neither model is valid. Previously a tie left best_model missing, which only surfaced
*** later as a failed assertion on the fitted l.
replace best_model = 1 if preference == 1 & sse_gq <= sse_beta
replace best_model = 2 if preference == 1 & sse_beta < sse_gq
*** Every observation must now carry a model choice
assert best_model < .
label values best_model best_model

*** Check that the choice is constant within a survey, and tabulate it
preserve
contract year iso3c surveytype urbrur best_model
isid year iso3c surveytype urbrur
tab best_model, m
restore

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 3. Use the best model (GQ or Beta) to calculate 100 fitted p's and l's for surveys where we have <100 p's and l's
***		a. Create 100 predicted p's and l's based on the best Lorenz curve model for surveys where we have <100 p's and l's
***		b. Combine the original values for surveys with >= 100 original p's and l's with these new predicted values for cases with <100 original p's and l's 
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Create 100 predicted p's and l's based on the best Lorenz curve model
*****************************************************************
*****************************************************************

**************************************
*** Keep data at survey level
**************************************
*** Collapse back to one row per survey: without the point-level p and l, all rows of a survey are identical
drop p l
duplicates drop
isid year iso3c surveytype urbrur

**************************************
*** Build an evenly spaced grid p = 0.01, 0.02, ..., 1 for every survey
**************************************
expand 100
bysort year iso3c surveytype urbrur: gen p = _n/100

**************************************
*** Fill in l on the grid from whichever model was chosen for the survey
**************************************
label list best_model

gen l = .

*** Surveys assigned the GQ model (best_model == 1)
replace l = -0.5*(b*p + e + (m*p^2 + n*p + e^2)^0.5) if best_model == 1
*** The GQ curve can overshoot 1 at the top of the distribution; cap it at p = 1
replace l = 1 if best_model == 1 & p == 1 & l > 1 & l < .

*** Surveys assigned the Beta model (best_model == 2)
replace l = p - (theta*(p^gamma)*((1-p)^delta)) if best_model == 2

*** Sanity check; a missing l also fails this, because missing compares as larger than any number
assert p <= 1 & l <= 1

*****************************************************************
*****************************************************************
*** b. Combine the original values for surveys with >= 100 original p's and l's with these new predicted values for cases with <100 original p's and l's 
*****************************************************************
*****************************************************************

**************************************
*** Save tempfile with necessary variables
**************************************
*** Keep the identifying variables plus the fitted p and l, and park them in a temporary file.
*** The tempfile name is used as-is (no ".dta" appended) so that Stata erases the file automatically.
keep countryname iso3c region surveytype urbrur year p l src survey_mean na_mean
tempfile pred_pl
save `pred_pl', replace

**************************************
*** Read in original p's and l's
**************************************
use "Input Data/Raw P's and L's.dta", clear

*** Drop cases with fewer than 100 p's and l's.
*** This uses the same rule as the selection of surveys to be fitted: points with a missing p or l
*** are removed first, and a survey is an iso3c-year-urbrur-surveytype cell. Counting differently here
*** (raw rows, grouped by countryname) could keep a survey in its raw form AND in its fitted form.
drop if p == . | l == .
bys iso3c year urbrur surveytype: gen num_p = _N
drop if num_p < 100
drop num_p

**************************************
*** Append our predicted values for the cases we just dropped
**************************************
append using `pred_pl'

sort countryname surveytype urbrur year p

********************************************************************************************
********************************************************************************************
********************************************************************************************
***	4. Combines the urban and rural income distributions into a national income distribution for countries with urban and rural p's and l's
***		a. Merge in population data
***		b. Create a single combined distribution for cases where we have urban and rural surveys
***		c. Mark any remaining urban or rural surveys as national surveys if they cover at least 85% of the population; otherwise, drop them
********************************************************************************************
********************************************************************************************
********************************************************************************************

*****************************************************************
*****************************************************************
*** a. Merge in population data
*****************************************************************
*****************************************************************

**************************************
*** Merge in urban/rural/total population, based on the survey type
**************************************
*** Attach the population that matches each survey's coverage (pop_urbrur for that urbrur code).
*** Population rows without a survey are allowed but discarded; a survey without population is an error.
merge m:1 countryname year urbrur using "Input Data/Population Data.dta", assert(2 3) keep(3) nogen norep

**************************************
*** Merge in national population, regardless of survey type
**************************************
*** Build a country-year file of national population (urbrur == 3) with the population renamed to pop.
*** The tempfile name is used as-is (no ".dta" appended) so that Stata erases the file automatically.
preserve
use "Input Data/Population Data.dta", clear
keep if urbrur == 3
rename pop_urbrur pop
label variable pop "Population (National)"
tempfile natl_pop
save `natl_pop', replace
restore

*** urbrur exists in both files but is not a key here; merge keeps the value already in memory
merge m:1 countryname year using `natl_pop', assert(2 3) keep(3) nogen norep

*****************************************************************
*****************************************************************
*** b. Create a single combined distribution for cases where we have urban and rural surveys
*****************************************************************
*****************************************************************

**************************************
*** First, check if we have BOTH an urban and a rural survey of the same type (income/consumption) in cases where we have either
**************************************

*** Flag, for every country-year-surveytype cell, whether it contains a rural survey (urbrur == 1)
*** and whether it contains an urban survey (urbrur == 2)
gen has_rural_temp = (urbrur == 1)
gen has_urban_temp = (urbrur == 2)
bysort countryname year surveytype: egen has_rural = max(has_rural_temp)
bysort countryname year surveytype: egen has_urban = max(has_urban_temp)
*** NOTE(review): the flags are set for every row of the cell, so a national (urbrur == 3) survey sharing
*** a cell with an urban/rural pair would be flagged too - confirm this cannot occur in the input data.

*** List the cells that have exactly one of the two (urban without rural, or rural without urban)
preserve
keep if has_rural != has_urban
contract countryname year surveytype urbrur
count	// 44 cases
list countryname year surveytype urbrur, sepby(countryname) abbreviate(20)
restore

**************************************
*** Calculate the actual population and mean income of that population for each percentile
**************************************

*** Add a zeroth p and l (necessary for calculations): for each survey that belongs to an urban/rural
*** pair, duplicate its first row and set p = l = 0 so the first real point has a predecessor to difference against
sort countryname year surveytype urbrur p
by countryname year surveytype urbrur: gen num = _n
expand (2) if num == 1 & (has_rural == 1 & has_urban == 1), gen(new)
replace p = 0 if new == 1
replace l = 0 if new == 1
sort countryname year surveytype urbrur p

**** Calculate actual population for each p: share of people between consecutive points times the survey's population.
*** The -by- prefix keeps the lag p[_n-1] inside the survey, as the mean-income lines below already do;
*** without it the added zeroth row differenced against the last point of the previous survey.
by countryname year surveytype urbrur: gen pop_p = (p-p[_n-1]) * pop_urbrur if (has_rural == 1 & has_urban == 1)

*** Calculate mean income for each p: income accruing between consecutive points divided by the people in that slice
by countryname year surveytype urbrur: gen mean_income_p = ((l-l[_n-1]) * (pop_urbrur * survey_mean)) / pop_p if (has_rural == 1 & has_urban == 1)
*** Mean income must rise strictly along the Lorenz curve (row 1 is the zeroth point, so comparisons start at row 3)
by countryname year surveytype urbrur: assert mean_income_p > mean_income_p[_n-1] if (has_rural == 1 & has_urban == 1) & _n > 2 & survey_mean < .

**************************************
*** Calculate new national p's and l's
**************************************
*** Condition identifying rows that belong to an urban/rural pair
local combine "(has_rural == 1 & has_urban == 1)"

*** The zeroth points have served their purpose
drop if new == 1

*** Pool the urban and rural slices of each country-year-surveytype and rank them by mean income,
*** regardless of which of the two surveys a slice came from
sort countryname year surveytype mean_income_p

*** Running totals over the ranked slices: people, and income (= mean income x people in the slice)
by countryname year surveytype: gen pop_p_cum = sum(pop_p) if `combine'
gen income_p = mean_income_p * pop_p if `combine'
by countryname year surveytype: gen income_p_cum = sum(income_p) if `combine'
by countryname year surveytype: egen income_total = total(income_p) if `combine'

*** National cumulative population share (p) and cumulative income/consumption share (l)
gen p_natl = pop_p_cum/pop if `combine'
gen l_natl = income_p_cum/income_total if `combine'

*** Neither share may exceed 1
assert p_natl <= 1 if !missing(p_natl)
assert l_natl <= 1 if !missing(l_natl)

*** Overwrite the survey-level p and l with the national values
replace p = p_natl if `combine'
replace l = l_natl if `combine'

**************************************
*** Update survey mean income as the national survey mean (population-weighted mean of the urban and rural surveys)
**************************************
*** Condition identifying rows that belong to an urban/rural pair
local combine "(has_rural == 1 & has_urban == 1)"

*** Spread the rural (urbrur == 1) and urban (urbrur == 2) survey mean and population
*** across every row of the country-year-surveytype cell
foreach v of varlist survey_mean pop_urbrur {
	local code = 0
	foreach area in rural urban {
		local ++code
		gen `v'_`area'_temp = `v' if `combine' & urbrur == `code'
		bysort countryname year surveytype: egen `v'_`area' = max(`v'_`area'_temp)
	}
}

*** National mean = rural and urban means weighted by each area's share of the national population
replace survey_mean = survey_mean_rural*(pop_urbrur_rural/pop) + survey_mean_urban * (pop_urbrur_urban/pop) if `combine'

**************************************
*** The combined distributions now count as national surveys (urbrur == 3) covering the national population
**************************************
replace urbrur = 3 if `combine'
replace pop_urbrur = pop if `combine'

*** Remove the working variables created while combining the urban and rural surveys
drop has_rural_temp has_urban_temp has_rural has_urban num new ///
	pop_p pop_p_cum p_natl mean_income_p income_p income_p_cum income_total l_natl ///
	survey_mean_* pop_urbrur_*

*****************************************************************
*****************************************************************
***	c. Mark any remaining urban or rural surveys as national surveys if they cover at least 85% of the population; otherwise, drop them
*****************************************************************
*****************************************************************

**************************************
*** Identify cases where we have only an urban or rural survey, and the survey does not cover at least 85% of the population
**************************************
*** Share of the national population (in percent) covered by each survey
gen pop_pct = pop_urbrur/pop*100
*** National surveys must cover (essentially) everyone
assert pop_pct >= 99 if urbrur == 3

*** List the urban-only / rural-only surveys whose coverage falls below 85%, highest coverage first
preserve
keep if urbrur != 3
contract countryname iso3c year surveytype urbrur pop_pct
gsort -pop_pct
format pop_pct %10.2fc
list countryname year surveytype urbrur pop_pct if pop_pct < 85, separator(100) noobs abbreviate(20)
restore

**************************************
*** Remove surveys covering less than 85% of the population (rows with a missing coverage share are kept)
**************************************
*** NOTE(review): the section heading speaks of re-marking the remaining urban/rural surveys as national,
*** but urbrur is not changed here - confirm whether that happens in a later file.
keep if pop_pct >= 85
drop pop_pct pop_urbrur

********************************************************************************************
********************************************************************************************
********************************************************************************************
*** 5. Save the data
********************************************************************************************
********************************************************************************************
********************************************************************************************

save "Output Data/Cleaned P's and L's.dta", replace
